{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 零基础实战机器学习 源代码\n",
    "\n",
    "第8讲 特征工程-类别型特征的哑变量\n",
    "\n",
    "作者 黄佳\n",
    "\n",
    "极客时间专栏链接：https://time.geekbang.org/column/intro/438"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单号</th>\n",
       "      <th>产品码</th>\n",
       "      <th>消费日期</th>\n",
       "      <th>产品说明</th>\n",
       "      <th>数量</th>\n",
       "      <th>单价</th>\n",
       "      <th>用户码</th>\n",
       "      <th>城市</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>536374</td>\n",
       "      <td>21258</td>\n",
       "      <td>6/1/2020 9:09</td>\n",
       "      <td>五彩玫瑰五支装</td>\n",
       "      <td>32</td>\n",
       "      <td>10.95</td>\n",
       "      <td>15100</td>\n",
       "      <td>北京</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>536376</td>\n",
       "      <td>22114</td>\n",
       "      <td>6/1/2020 9:32</td>\n",
       "      <td>茉莉花白色25枝</td>\n",
       "      <td>48</td>\n",
       "      <td>3.45</td>\n",
       "      <td>15291</td>\n",
       "      <td>上海</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>536376</td>\n",
       "      <td>21733</td>\n",
       "      <td>6/1/2020 9:32</td>\n",
       "      <td>教师节向日葵3枝尤加利5枝</td>\n",
       "      <td>64</td>\n",
       "      <td>2.55</td>\n",
       "      <td>15291</td>\n",
       "      <td>上海</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>536378</td>\n",
       "      <td>22386</td>\n",
       "      <td>6/1/2020 9:37</td>\n",
       "      <td>百合粉色10花苞</td>\n",
       "      <td>10</td>\n",
       "      <td>1.95</td>\n",
       "      <td>14688</td>\n",
       "      <td>北京</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>536378</td>\n",
       "      <td>85099C</td>\n",
       "      <td>6/1/2020 9:37</td>\n",
       "      <td>橙黄香槟色康乃馨</td>\n",
       "      <td>10</td>\n",
       "      <td>1.95</td>\n",
       "      <td>14688</td>\n",
       "      <td>北京</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      订单号     产品码           消费日期           产品说明  数量     单价    用户码  城市\n",
       "0  536374   21258  6/1/2020 9:09        五彩玫瑰五支装  32  10.95  15100  北京\n",
       "1  536376   22114  6/1/2020 9:32       茉莉花白色25枝  48   3.45  15291  上海\n",
       "2  536376   21733  6/1/2020 9:32  教师节向日葵3枝尤加利5枝  64   2.55  15291  上海\n",
       "3  536378   22386  6/1/2020 9:37       百合粉色10花苞  10   1.95  14688  北京\n",
       "4  536378  85099C  6/1/2020 9:37       橙黄香槟色康乃馨  10   1.95  14688  北京"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd #导入Pandas\n",
    "df_sales = pd.read_csv('易速鲜花订单记录.csv') #载入数据\n",
    "df_sales.head() #显示头几行数据 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单号</th>\n",
       "      <th>产品码</th>\n",
       "      <th>消费日期</th>\n",
       "      <th>产品说明</th>\n",
       "      <th>数量</th>\n",
       "      <th>单价</th>\n",
       "      <th>用户码</th>\n",
       "      <th>城市_北京</th>\n",
       "      <th>城市_广州</th>\n",
       "      <th>城市_成都</th>\n",
       "      <th>城市_深圳</th>\n",
       "      <th>城市_苏州</th>\n",
       "      <th>城市_西安</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>536374</td>\n",
       "      <td>21258</td>\n",
       "      <td>6/1/2020 9:09</td>\n",
       "      <td>五彩玫瑰五支装</td>\n",
       "      <td>32</td>\n",
       "      <td>10.95</td>\n",
       "      <td>15100</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>536376</td>\n",
       "      <td>22114</td>\n",
       "      <td>6/1/2020 9:32</td>\n",
       "      <td>茉莉花白色25枝</td>\n",
       "      <td>48</td>\n",
       "      <td>3.45</td>\n",
       "      <td>15291</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>536376</td>\n",
       "      <td>21733</td>\n",
       "      <td>6/1/2020 9:32</td>\n",
       "      <td>教师节向日葵3枝尤加利5枝</td>\n",
       "      <td>64</td>\n",
       "      <td>2.55</td>\n",
       "      <td>15291</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>536378</td>\n",
       "      <td>22386</td>\n",
       "      <td>6/1/2020 9:37</td>\n",
       "      <td>百合粉色10花苞</td>\n",
       "      <td>10</td>\n",
       "      <td>1.95</td>\n",
       "      <td>14688</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>536378</td>\n",
       "      <td>85099C</td>\n",
       "      <td>6/1/2020 9:37</td>\n",
       "      <td>橙黄香槟色康乃馨</td>\n",
       "      <td>10</td>\n",
       "      <td>1.95</td>\n",
       "      <td>14688</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87175</th>\n",
       "      <td>581585</td>\n",
       "      <td>21684</td>\n",
       "      <td>6/9/2021 12:31</td>\n",
       "      <td>产品说明掩码</td>\n",
       "      <td>12</td>\n",
       "      <td>5.00</td>\n",
       "      <td>15804</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87176</th>\n",
       "      <td>581585</td>\n",
       "      <td>22398</td>\n",
       "      <td>6/9/2021 12:31</td>\n",
       "      <td>产品说明掩码</td>\n",
       "      <td>12</td>\n",
       "      <td>499.00</td>\n",
       "      <td>15804</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87177</th>\n",
       "      <td>581585</td>\n",
       "      <td>23328</td>\n",
       "      <td>6/9/2021 12:31</td>\n",
       "      <td>产品说明掩码</td>\n",
       "      <td>4</td>\n",
       "      <td>58.00</td>\n",
       "      <td>15804</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87178</th>\n",
       "      <td>581585</td>\n",
       "      <td>23145</td>\n",
       "      <td>6/9/2021 12:31</td>\n",
       "      <td>产品说明掩码</td>\n",
       "      <td>12</td>\n",
       "      <td>88.90</td>\n",
       "      <td>15804</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87179</th>\n",
       "      <td>581585</td>\n",
       "      <td>22466</td>\n",
       "      <td>6/9/2021 12:31</td>\n",
       "      <td>产品说明掩码</td>\n",
       "      <td>12</td>\n",
       "      <td>78.00</td>\n",
       "      <td>15804</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>87180 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          订单号     产品码            消费日期           产品说明  数量      单价    用户码  \\\n",
       "0      536374   21258   6/1/2020 9:09        五彩玫瑰五支装  32   10.95  15100   \n",
       "1      536376   22114   6/1/2020 9:32       茉莉花白色25枝  48    3.45  15291   \n",
       "2      536376   21733   6/1/2020 9:32  教师节向日葵3枝尤加利5枝  64    2.55  15291   \n",
       "3      536378   22386   6/1/2020 9:37       百合粉色10花苞  10    1.95  14688   \n",
       "4      536378  85099C   6/1/2020 9:37       橙黄香槟色康乃馨  10    1.95  14688   \n",
       "...       ...     ...             ...            ...  ..     ...    ...   \n",
       "87175  581585   21684  6/9/2021 12:31         产品说明掩码  12    5.00  15804   \n",
       "87176  581585   22398  6/9/2021 12:31         产品说明掩码  12  499.00  15804   \n",
       "87177  581585   23328  6/9/2021 12:31         产品说明掩码   4   58.00  15804   \n",
       "87178  581585   23145  6/9/2021 12:31         产品说明掩码  12   88.90  15804   \n",
       "87179  581585   22466  6/9/2021 12:31         产品说明掩码  12   78.00  15804   \n",
       "\n",
       "       城市_北京  城市_广州  城市_成都  城市_深圳  城市_苏州  城市_西安  \n",
       "0          1      0      0      0      0      0  \n",
       "1          0      0      0      0      0      0  \n",
       "2          0      0      0      0      0      0  \n",
       "3          1      0      0      0      0      0  \n",
       "4          1      0      0      0      0      0  \n",
       "...      ...    ...    ...    ...    ...    ...  \n",
       "87175      0      0      0      1      0      0  \n",
       "87176      0      0      0      1      0      0  \n",
       "87177      0      0      0      1      0      0  \n",
       "87178      0      0      0      1      0      0  \n",
       "87179      0      0      0      1      0      0  \n",
       "\n",
       "[87180 rows x 13 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 把多分类字段转换为二分类哑编码 \n",
    "category_features = ['城市'] #要转换的特征列表\n",
    "df_sales = pd.get_dummies(df_sales, #创建哑变量\n",
    "                          drop_first=True, \n",
    "                          columns=category_features) \n",
    "# df_sales = pd.get_dummies(df_sales, \n",
    "#                           columns=category_features) #创建哑变量\n",
    "df_sales #显示数据"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
